
TED talks are video recordings of influential talks given at, and hosted by, TED Conferences LLC. TED was founded in 1984 and has since built a reputation for spreading inspiring, powerful ideas in fields ranging from technology to science to education. With video recordings of TED talks having garnered over 1 billion views to date, TED clearly represents a significant platform and opportunity for anyone with a powerful mission to raise awareness of, and draw attention to, their work.
# Python ≥3.5 is required
import sys
assert sys.version_info >= (3, 5)
# Scikit-Learn ≥0.20 is required
import re
import sklearn
# Compare (major, minor) numerically: as plain strings "0.9" >= "0.20" is True,
# so a lexicographic comparison would let too-old releases through.
# Only the first two digit groups are used, so suffixes like "rc1"/"dev0" are ignored.
sklearn_major_minor = tuple(int(part) for part in re.findall(r"\d+", sklearn.__version__)[:2])
assert sklearn_major_minor >= (0, 20)
# Common imports
import numpy as np
import os
import pandas as pd
# To plot pretty figures
%matplotlib inline
import matplotlib as mpl
import matplotlib.pyplot as plt
# Font sizes for axis labels and for the x/y tick labels of every figure
for rc_group, label_size in (('axes', 14), ('xtick', 12), ('ytick', 12)):
    mpl.rc(rc_group, labelsize=label_size)

# Figures are written to <project root>/images/<chapter id>;
# the directory is created up front if it does not exist yet
PROJECT_ROOT_DIR = "."
CHAPTER_ID = "end_to_end_project"
IMAGES_PATH = os.path.join(PROJECT_ROOT_DIR, "images", CHAPTER_ID)
os.makedirs(IMAGES_PATH, exist_ok=True)
def save_fig(fig_id, tight_layout=True, fig_extension="png", resolution=300):
    """Save the current matplotlib figure into IMAGES_PATH.

    Args:
        fig_id: File name without extension; also echoed in the progress message.
        tight_layout: When true, call ``plt.tight_layout()`` before saving.
        fig_extension: File extension, also passed to ``savefig`` as the format.
        resolution: Value passed to ``savefig`` as ``dpi``.
    """
    file_name = fig_id + "." + fig_extension
    destination = os.path.join(IMAGES_PATH, file_name)
    print("Saving figure", fig_id)
    if tight_layout:
        plt.tight_layout()
    plt.savefig(destination, format=fig_extension, dpi=resolution)
import warnings

# Suppress warnings whose message starts with "internal gelsd" (see SciPy issue #5998)
warnings.filterwarnings("ignore", message="^internal gelsd")

# Disable pandas' chained-assignment warning (the default setting is 'warn')
pd.options.mode.chained_assignment = None

# Do not truncate the column list when a DataFrame is displayed
pd.set_option('display.max_columns', None)
import pandas as pd
import requests
import io
# Download the TED talks CSV from the project's GitHub repository
url = "https://raw.githubusercontent.com/McGill-MMA-EnterpriseAnalytics/TED/main/data/ted_main.csv"
# A timeout keeps the notebook from hanging forever on a stalled connection
response = requests.get(url, timeout=30)
# Fail loudly on an HTTP error instead of parsing an error page as CSV
response.raise_for_status()
download = response.content
# Decode the downloaded bytes as UTF-8 and parse them into the DataFrame `df`
df = pd.read_csv(io.StringIO(download.decode('utf-8')))
# Preview the first 5 rows, then list column dtypes and non-null counts
df.head()
df.info()
# Select the columns in a reading-friendly order, with 'views' placed last
column_order = [
    'name', 'title', 'description', 'main_speaker', 'speaker_occupation',
    'num_speaker', 'duration', 'event', 'film_date', 'published_date',
    'comments', 'tags', 'languages', 'ratings', 'related_talks', 'url', 'views',
]
df = df[column_order]
df.head()
# Overview of the dataset: dimensions, missing values per column, summary statistics
for summary in (df.shape, df.isnull().sum(), df.describe()):
    display(summary)
The speaker_occupation column has 6 missing values.
# Working on df_copy for the rest of the data exploration
df_copy = df.copy()

# Keep only the rows whose event name contains "TED" (literal, case-insensitive match;
# rows with a missing event are excluded)
# NOTE(review): because the match ignores case and position, names that merely
# contain "ted" (e.g. "United ...") are kept as well - confirm this is intended
is_ted_event = df_copy['event'].str.contains('TED', regex=False, case=False, na=False)
df_copy = df_copy[is_ted_event]
df_copy.shape

# Convert duration to minutes (divide by 60), rounded to 2 decimals
df_copy["duration"] = (df_copy["duration"] / 60).round(2)

# Replace every remaining missing value, in any column, with the text 'Unknown'
df_copy.fillna('Unknown', inplace=True)
df_copy['languages'].describe()
Observations with zero languages are musicals
# Peek at the first three talks that report zero languages
no_language_mask = df_copy['languages'].eq(0)
df_copy[no_language_mask].head(3)
# Renumber the rows 0..n-1; the previous row labels are kept in a new 'index' column
df_copy.reset_index(inplace=True)
import ast
from collections import defaultdict

rating_data = defaultdict(list)

# Each 'ratings' cell is the text of a Python list of dicts; parse it and
# gather every distinct value found under the 'name' key
rating_names = {
    entry['name']
    for raw_ratings in df_copy['ratings']
    for entry in ast.literal_eval(raw_ratings)
}
rating_names
# Extracting ratings
# Build one record per talk: {'ID': <value of the 'index' column>, <rating name>: <count>, ...}.
# Using per-talk records (not one list per rating name) keeps rows aligned even
# if a talk lacks some rating name: that cell becomes NaN instead of the
# DataFrame constructor failing on lists of unequal length.
rating_records = []
for talk_id, raw_ratings in zip(df_copy['index'], df_copy['ratings']):
    record = {'ID': talk_id}
    for entry in ast.literal_eval(raw_ratings):
        record[entry['name']] = entry['count']
    rating_records.append(record)

# Column order: 'ID' first, then rating names in the order they are first seen
rating_columns = ['ID']
for record in rating_records:
    for key in record:
        if key not in rating_columns:
            rating_columns.append(key)

rating_data = pd.DataFrame(rating_records, columns=rating_columns)
rating_data.head()
# Extracting tags
# Long format: one (ID, tag) row per tag of each talk, where ID is the talk's
# value in the 'index' column and the tags come from parsing the 'tags' text
tag_pairs = [
    (talk_id, tag)
    for talk_id, raw_tags in zip(df_copy['index'], df_copy['tags'])
    for tag in ast.literal_eval(raw_tags)
]
tags_data = pd.DataFrame(tag_pairs, columns=['ID', 'tags'])
tags_data[tags_data['ID']==1]
# Extracting related talks
def _related_talk_means(raw_related):
    """Return (mean 'viewed_count', mean 'duration') over one talk's related talks.

    `raw_related` is parsed with ast.literal_eval, like the 'ratings' and
    'tags' columns, instead of being split on commas and colons by hand.
    # assumes the cell holds the text of a list of dicts - TODO confirm

    Entries lacking a key are skipped for that statistic. When nothing can be
    averaged, or the text is not a valid literal (e.g. the 'Unknown'
    placeholder), NaN is returned for that statistic rather than raising.
    """
    try:
        related = ast.literal_eval(raw_related)
    except (ValueError, SyntaxError):
        related = []

    def _mean(key):
        values = [talk[key] for talk in related if key in talk]
        return sum(values) / len(values) if values else np.nan

    return _mean('viewed_count'), _mean('duration')


related_means = [_related_talk_means(raw) for raw in df_copy['related_talks']]
# Assign whole columns at once: element-wise chained assignment
# (df['col'][i] = value) is not guaranteed to write back into the DataFrame,
# and the float means would not fit the integer columns it started from.
df_copy['related_views'] = [mean_views for mean_views, _ in related_means]
# NOTE(review): these durations are averaged as stored in 'related_talks' and
# are not divided by 60 here - confirm whether minutes are wanted
df_copy['related_duration'] = [mean_duration for _, mean_duration in related_means]
df_copy.head(3)
# (prefix, category) pairs, checked in order: the first matching prefix wins,
# so specific prefixes ('TED@BCG') must come before general ones ('TED@', 'TED')
EVENT_CATEGORY_PREFIXES = [
    ('TED20', 'TED2000s'),
    ('TED19', 'TED1900s'),
    ('TEDx', 'TEDx'),
    ('TED@BCG', 'TED@BCG'),
    ('TED@', 'TED@'),
    ('TEDSalon', 'TEDSalon'),
    ('TEDGlobal', 'TEDGlobal'),
    ('TEDWomen', 'TEDWomen'),
    ('TEDMED', 'TEDMED'),
    ('TED', 'TEDOther'),
]


def _event_category(event_name):
    """Return the category of the first prefix `event_name` starts with, else 'Other'."""
    for prefix, category in EVENT_CATEGORY_PREFIXES:
        if event_name.startswith(prefix):
            return category
    return 'Other'


# One column assignment replaces the row-by-row chained assignment
# (df['col'][i] = value), which is not guaranteed to write back into the DataFrame
df_copy['event_category'] = df_copy['event'].apply(_event_category)
# Convert timestamp into readable format
import datetime


def _utc_date(epoch_seconds):
    """Return the UTC calendar date for a Unix timestamp given in seconds.

    The date is computed by adding the offset to the 1970-01-01 epoch, so the
    result does not depend on the machine's local timezone (unlike
    ``date.fromtimestamp``) and negative timestamps work on every platform.
    """
    return (datetime.datetime(1970, 1, 1) + datetime.timedelta(seconds=int(epoch_seconds))).date()


# For each date column: replace the timestamp with a date, then derive the
# weekday number (0 = Monday), the month and the year
for date_col, day_col, month_col, year_col in (
    ('published_date', 'day', 'month', 'year'),
    ('film_date', 'day_film', 'month_film', 'year_film'),
):
    df_copy[date_col] = df_copy[date_col].apply(_utc_date)
    df_copy[day_col] = df_copy[date_col].apply(lambda d: d.weekday())
    df_copy[month_col] = df_copy[date_col].apply(lambda d: d.month)
    df_copy[year_col] = df_copy[date_col].apply(lambda d: d.year)

# Replace weekday numbers with names in both weekday columns
# ("Thursday" was previously misspelled "Thurday")
weekday_names = {0: "Monday", 1: "Tuesday", 2: "Wednesday", 3: "Thursday", 4: "Friday", 5: "Saturday",
                 6: "Sunday"}
to_cat = {"day": weekday_names, "day_film": weekday_names}
df_copy.replace(to_cat, inplace=True)
# New attributes: character length of the title and of the description, the
# number of rows sharing the same main speaker, and a 0/1 repeat-speaker flag
title_text = df_copy['title']
description_text = df_copy['description']
df_copy['title_len'] = title_text.str.len()
df_copy['description_len'] = description_text.str.len()

talks_per_speaker = df_copy.groupby('main_speaker')['index'].transform('count')
df_copy['speaker_frequency'] = talks_per_speaker
# 1 when the speaker appears on more than one row, otherwise 0
df_copy['repeat_speaker'] = np.where(talks_per_speaker > 1, 1, 0)
# Number of rows per tag, most frequent tag first
temp = (
    tags_data
    .groupby(['tags'])
    .count()
    .sort_values(by='ID', ascending=False)
)
temp.head(3)
# Creating Tag Categories
# Tags belonging to each category ('computers' and 'biology' were previously
# misspelled 'comuters' / 'biollogy' and therefore could never match a tag)
Tech = ['technology','future','computers','science','invention','research']
Humanity = ['community','society','social change','humanity','culture']
Global_Issues = ['global issues','activism','politics','inequality','environment','climate change']
Art = ['design','art','innovation','creativity','brain']
Business = ['business','economics']
Entertainment = ['entertainment','media','sports']
Health = ['health','biology','medicine','health care','medical research']
Communication = ['communication','collaboration']
Education = ['children','education','teaching','parenting']

# (column name, tags) pairs, in the order the flag columns are created
tag_categories = [
    ('Technology/Science', Tech),
    ('Humanity', Humanity),
    ('Global Issues', Global_Issues),
    ('Art/Creativity', Art),
    ('Business', Business),
    ('Entertainment', Entertainment),
    ('Health', Health),
    ('Communication', Communication),
    ('Education', Education),
]

# tags_data['ID'] stores each talk's value from df_copy's 'index' column (the
# row label from before reset_index), not its current row label. Match on that
# column rather than using the ID as a row label, which would flag a different
# talk whenever rows had been filtered out. Each flag is 1 when the talk has at
# least one tag of the category, otherwise 0.
for category, category_tags in tag_categories:
    tagged_ids = tags_data.loc[tags_data['tags'].isin(category_tags), 'ID']
    df_copy[category] = df_copy['index'].isin(tagged_ids).astype(int)
# Remove the columns listed below from df_copy.
# `columns=` is used because passing the axis positionally (`drop(labels, 1)`)
# is rejected by recent pandas releases; the duplicate 'title' entry is gone too.
columns_to_drop = [
    'index', 'comments', 'event', 'film_date', 'main_speaker', 'name',
    'published_date', 'ratings', 'url', 'description', 'title',
    'related_talks', 'tags', 'speaker_occupation',
]
df_copy = df_copy.drop(columns=columns_to_drop)
df_copy.head()
import pandas_profiling
from pandas_profiling import ProfileReport
# Build a profiling report of df_copy, asking the HTML style to use the full width
report_html_options = {'style': {'full_width': True}}
profile = ProfileReport(
    df_copy,
    title='Pandas Profiling Report',
    html=report_html_options,
)
profile